The various methods and algorithms of machine learning are explored and tested with the purpose to regionalize specific capacity for the study area of the Lake Chad Basin. Specific capacity was obtained from puming tests at 1387 locations.Various geoscientific parameters are used and tested for training and building the models, such as elevation, river density, lithology, etc. But before getting started, some presettings are necessary to simplify coding and reproducability.

1 Environment Presettings

1.1 Set working directory

Set the working directory relative to the folder containing the source script.

setwd(substr(
  rstudioapi::getActiveDocumentContext()$path,
  1,
  rev(gregexpr(
    "/", rstudioapi::getActiveDocumentContext()$path
  )[[1]])[2]
))

1.2 External Functions

Often needed parts of the scipt were outsourced to external functions. This simplifies and shortens the code for a more comprehensible reading. The following lines source these functions from the folder called functions with a relative path to the active documents path.

purrr::map(
  list.files(
    path = './scripts/functions',
    pattern = "\\.R$",
    full.names = TRUE,
    recursive = TRUE
  ),
  source
)

1.3 Required Packages

Install and load all packages within the brackets of the pacman::p_load() function. Note: Load the tidyverse package last to prevent functions to be masked by other packages.

if (!require("pacman"))
  install.packages("pacman")
pacman::p_load(
  readxl,
  janitor,
  ggplot2,
  kable,
  kableExtra,
  png,
  FSA,
  sp,
  strict,
  glue,
  leaflet,
  leaflet.extras,
  htmlwidgets,
  htmltools,
  plotly,
  RANN,
  plyr,
  dplyr,
  spdplyr,
  rgeos,
  rgdal,
  raster,
  tidyverse
)

2 Data Import

2.1 Stammdaten

stammdaten <- readxl::read_excel('./raw_data/Datensatz/Stammdaten_D.xlsx') %>% 
  as_tibble() %>% 
  clean_names()

names(stammdaten) <- c('obswell_id', 'name', 'lon', 'lat')

2.2 Hydrogeological Units

2.2.1 First Time

Load hydrogeological unit data

# hydr_unit <- readOGR('./raw_data/hyraum_v32/hyraum_gr__v32_poly.shp', encoding = "UTF-8/LATIN-1/...") %>% 
#   clean_names() %>% spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))
# 
# hydr_unit <- hydr_unit %>% 
#   dplyr::mutate(gr_name = str_replace_all(gr_name, c("ü" = "ue", "ö" = "oe", "ä" = "ae", "ß" = "sz"))) %>% 
#   dplyr::mutate(gr_nr_name = str_replace_all(gr_nr_name, c("ü" = "ue", "ö" = "oe", "ä" = "ae", "ß" = "sz")))
# 
# hydr_unit %>% writeOGR('./processed_data/hydr_unit_umlaute_rm.shp', layer = 'hydr_unit_umlaute_rm', driver="ESRI Shapefile")

2.2.2 Second Time

Load hydrogeological unit data

hydr_unit <- readOGR('./processed_data/hydr_unit_umlaute_rm.shp') %>% 
  clean_names() %>% spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Noelscher.M\Projects\13_spatial_analysis_12Kobswell\r\processed_data\hydr_unit_umlaute_rm.shp", layer: "hydr_unit_umlaute_rm"
## with 11 features
## It has 5 fields

2.3 Water production Sites

Data on groundwater pumping stations was obtained from the HAD Thema 7.2 provided by Bundesanstalt für Gewässerkunde (BfG) Source: „Hydrologischer Atlas von Deutschland/BfG, 2000“

Import

public_water_prod_sp <- readOGR('./raw_data/had/off_wassergewinnung.shp')
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Noelscher.M\Projects\13_spatial_analysis_12Kobswell\r\raw_data\had\off_wassergewinnung.shp", layer: "off_wassergewinnung"
## with 1230 features
## It has 33 fields
## Integer64 fields read as strings:  TK X_LAMBERT Y_LAMBERT
mining_water_prod_sp <- readOGR('./raw_data/had/bergbau_wassergewinnung.shp')
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Noelscher.M\Projects\13_spatial_analysis_12Kobswell\r\raw_data\had\bergbau_wassergewinnung.shp", layer: "bergbau_wassergewinnung"
## with 379 features
## It has 30 fields
## Integer64 fields read as strings:  TK X_LAMBERT Y_LAMBERT
powerplant_water_prod_sp <- readOGR('./raw_data/had/kraftwerke.shp')
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Noelscher.M\Projects\13_spatial_analysis_12Kobswell\r\raw_data\had\kraftwerke.shp", layer: "kraftwerke"
## with 62 features
## It has 31 fields
## Integer64 fields read as strings:  TK X_LAMBERT Y_LAMBERT
reservoir_sp <- readOGR('./raw_data/had/talsperre.shp')
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Noelscher.M\Projects\13_spatial_analysis_12Kobswell\r\raw_data\had\talsperre.shp", layer: "talsperre"
## with 64 features
## It has 47 fields
## Integer64 fields read as strings:  LFD_NR R_WERT H_WERT XLAMBERT YLAMBERT NR_HAD

3 Data Preparation

3.1 Start date of time series

3.1.1 First time

First we list all files in the directory

# files_list <- list.files(path = "./raw_data/Datensatz/Messstellen preprocessed/", pattern = "*.txt")

Then we apply a custom function to read in all files, bind them and extract the lines with first observation grouped by the wells This implementation to find the start date is very slow, next time it would be worth trying another method.

# start_date <- files_list %>% extract_start_date()

Now we ungroup the dataframe

# start_date <- start_date %>% ungroup()

# names(start_date) <- c(names(stammdaten)[1], 'name', 'start_date', 'value')

As the previous steps took a few ours, we store the result as .csv

# start_date %>% write_csv2(path = './processed_data/start_date.csv')

3.1.2 Second time continue here…

Read in start_date.csv

start_date <- read_csv2('./processed_data/start_date.csv')

3.2 Data Wrangling

Join the 2 dataframes by their common obswell_id

stammdaten_join_startdate <- stammdaten %>%
  dplyr::select(-name) %>% 
  left_join(start_date, by = 'obswell_id')

Show example entries

stammdaten_join_startdate %>% headtail()
##          obswell_id    lon     lat                        name start_date
## 1       BB_25470023 808527 5928687               Ottenhagen OP 2000-11-20
## 2       BB_25470024 808530 5928686               Ottenhagen UP 2000-12-18
## 3       BB_25480025 820407 5933409                 Neumannshof 2000-11-20
## 13490 TH_5633900114 660999 5581364       Hy Heinersdorf 1_2004 2007-01-08
## 13491 TH_5729240555 616545 5566272 Hy Schweickershausen 1_2002 2007-01-08
## 13492 TH_5730000077 625563 5568125          Br.Lindenau (0006) 2007-01-08
##         value
## 1       78.96
## 2       78.78
## 3       34.93
## 13490 429.972
## 13491   348.9
## 13492  281.26

Now we separate the coordinates from the dataframe

coords <- stammdaten_join_startdate %>% 
  dplyr::select(lon, lat)

Show example entries

coords %>% headtail()
##          lon     lat
## 1     808527 5928687
## 2     808530 5928686
## 3     820407 5933409
## 13490 660999 5581364
## 13491 616545 5566272
## 13492 625563 5568125

Delete coordinate columns from dataframe

stammdaten_join_startdate <- stammdaten_join_startdate %>% 
  dplyr::select(-one_of(colnames(coords)))

Show example entries

stammdaten_join_startdate %>% headtail()
##          obswell_id                        name start_date   value
## 1       BB_25470023               Ottenhagen OP 2000-11-20   78.96
## 2       BB_25470024               Ottenhagen UP 2000-12-18   78.78
## 3       BB_25480025                 Neumannshof 2000-11-20   34.93
## 13490 TH_5633900114       Hy Heinersdorf 1_2004 2007-01-08 429.972
## 13491 TH_5729240555 Hy Schweickershausen 1_2002 2007-01-08   348.9
## 13492 TH_5730000077          Br.Lindenau (0006) 2007-01-08  281.26

3.3 Conversion to Spatial Data

Now we create a SpatialPointsdataframe from our stammdaten dataframe

obswell_sp <- SpatialPointsDataFrame(coords = coords,
                                     data = stammdaten_join_startdate,
                                     proj4string = CRS('+proj=utm +zone=32 +ellps=GRS80 +units=m +no_defs'))

The coodinate reference system needs to be transformed to the one leaflet is expecting

obswell_sp <- obswell_sp %>% spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))

4 Visualization

We create a color palette from the start dates

pal <- colorNumeric(
  palette = "Spectral",
  domain = as.numeric(obswell_sp$start_date))

We create a leaflet html GIS

map1 <- obswell_sp %>% create_custom_html_leafletmap1()

Plot the leaflet GIS

map1

Save the leaflet widget

saveWidget(map1, file='map1.html')

5 Selection of Training Data

For the classification of the time series as class anthropogenic or class natural training data is needed The selection of the training data for these two classes is described in the following chapter

Water sources per federal state in Germany

Water sources per federal state in Germany, Source: Grundwasser in Deutschland - Bundesministerium für Umwelt, Naturschutz und Reaktorsicherheit (BMU), 2008




Withdrawal of water for irrigation grouped by groundwater and surface water

Withdrawal of water for irrigation grouped by groundwater and surface water, Source: Statistik in der Wasserversorgung in der Landwirtschaft 2002 - Statistisches Bundesamt, 2004

5.1 Method 2 - Water Production Sites

5.1.1 Data Preparation

CRS Transformation

public_water_prod_sp <- public_water_prod_sp %>%
  spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))

mining_water_prod_sp <- mining_water_prod_sp %>%
  spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))

powerplant_water_prod_sp <- powerplant_water_prod_sp %>%
  spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))

reservoir_sp <- reservoir_sp %>%
  spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))

Adding Source

public_water_prod_sp@data <- public_water_prod_sp@data %>%
  as_tibble() %>%
  dplyr::mutate(source = 'public water production')

mining_water_prod_sp@data <- mining_water_prod_sp@data %>%
  as_tibble() %>%
  dplyr::mutate(source = 'mining water production')

powerplant_water_prod_sp@data <- powerplant_water_prod_sp@data %>%
  as_tibble() %>%
  dplyr::mutate(source = 'powerplant water production')

reservoir_sp@data <- reservoir_sp@data %>%
  as_tibble() %>%
  dplyr::mutate(source = 'reservoir')

Specific Manipulation Filter out only types of water production that affect groundwater directly (Grundwasser, Mehrfachentnahme)

mining_water_prod_sp <- mining_water_prod_sp %>% 
  dplyr::filter(Art_Gewinn == 'Grundwasser' | Art_Gewinn == 'Mehrfachentnahme')

public_water_prod_sp <- public_water_prod_sp %>% 
  dplyr::filter(Art_Gewinn == 'Grundwasser' | Art_Gewinn == 'Mehrfachentnahme')

powerplant_water_prod_sp <- powerplant_water_prod_sp %>% 
  dplyr::filter(Art_Gewinn == 'Grundwasser' | Art_Gewinn == 'Mehrfachentnahme')

Join Points

water_production_join_sp <- raster::union(mining_water_prod_sp, public_water_prod_sp)
water_production_join_sp <- raster::union(water_production_join_sp, powerplant_water_prod_sp)

Add unique IDs

water_production_join_sp <- water_production_join_sp %>% 
  dplyr::mutate(id_unique = paste0('id_', seq(1,length.out = base::nrow(water_production_join_sp))))

Plot in Map

pal2 <- colorFactor(palette = 'Set1', domain = base::as.factor(water_production_join_sp$source) %>% base::unique())
labels <- c('powerplant water production', 'mining water production', 'public water production')

Create a leaflet GIS

map2 <- map1 %>% create_custom_html_leafletmap2()

Plot the leaflet GIS

map2

Save the leaflet widget

saveWidget(map2, file='map2.html')

Find nearest neighbors

obswell_sp_coords <- obswell_sp %>% 
  spTransform(CRS('+proj=utm +zone=32 +ellps=WGS84 +datum=WGS84 +units=m +no_defs')) %>% 
  coordinates() %>% 
  as_tibble()

water_production_join_sp_coords <- water_production_join_sp %>% 
  spTransform(CRS('+proj=utm +zone=32 +ellps=WGS84 +datum=WGS84 +units=m +no_defs'))  %>% 
  coordinates() %>% 
  as_tibble()

temp_result <- water_production_join_sp_coords %>% 
  nn2(obswell_sp_coords, k = 1)

nearest_neighbors <- tibble(index = temp_result$nn.idx[,1],
                            distance = temp_result$nn.dists[,1])

obswell_sp <- obswell_sp %>% 
  dplyr::mutate(nn_index = nearest_neighbors$index,
                nn_distance = nearest_neighbors$distance)

Plot the distribution of the distance

nearest_neighbors %>% plot_histogram_custom3()

5.1.2 Class Antropogenic

For the selection of observation wells that are antropogenically influenced, we choose wells from cities that use groundwater for drinking purposes or have shallow groundwater table. A shallow groundwater table causes a groundwater abstraction for constructions leading to an anthropogenically influenced groundwater table.

anthropogenic_obswell_m2 <- obswell_sp %>% 
  dplyr::filter(nn_distance < 1000)

anthropogenic_obswell_m2 %>% 
  as_tibble() %>% 
  write_csv2('./processed_data/training_data_anthropogenic_obswell_m2.csv')

Plot distribution of the distance of antropogenic training data

anthropogenic_obswell_m2@data %>% ggplot() + 
  geom_histogram(
    aes(nn_distance),
    binwidth = 100,
    col = 'black',
    fill = 'red',
    alpha = .5
  )

5.1.3 Class Natural

natural_obswell_m2 <- obswell_sp %>% 
  dplyr::arrange(nn_distance) %>% 
  dplyr::slice(utils::tail(row_number(), 
                           base::nrow(anthropogenic_obswell_m2)))

natural_obswell_m2 %>% 
  as_tibble() %>% 
  write_csv2('./processed_data/training_data_natural_obswell_m2.csv')

Plot distribution of the distance of natural training data

natural_obswell_m2@data %>% ggplot() + 
  geom_histogram(
    aes(nn_distance),
    binwidth = 1000,
    col = 'black',
    fill = 'red',
    alpha = .5
  )